********************************************************************************
* Created: October 2022
* Author: Michael Jensen - mjensen6@nd.edu
* Last Edited: MJ Feb 2023
* This file cleans the data we get from Khan Academy. 

* KA has pulled all data for August 2021 - May 2022 for accounts with a @aisd.net email,
* or students in an account where the teacher has an @aisd.net email.

* For students with an aisd.net email, we use their own practice data.
* For students without one, we assign the average practice of the students in their class.
********************************************************************************

* Load the raw year-1 Khan Academy extract.
use "${rawdatapath}ka_data_yr1.dta", clear

* Student ids are only available for students who used an @aisd email.
* For everyone else we fall back on the average practice of their teacher's
* students in a given week, so reduce the data to one row of means per
* teacher (new_t_email) and week (week_date).
local ka_measures total_minutes learning_minutes math_minutes ///
	math_learning_minutes mlm_exercise mlm_video mlm_article ///
	math_skills_practiced msl_net msl_familiar msl_proficient ///
	msl_master msp_earned

collapse (mean) `ka_measures', by(new_t_email week_date)

* Store the teacher-by-week averages ordered by teacher, then week.
sort new_t_email week_date
save "${datapath}ka_class_averages.dta", replace


** One change to 1st data set **
* To speed up the later cleaning, cut the Khan data down to the students and
* weeks that appear in the roster we got from AISD.
use "${rawdatapath}roster_year1.dta", clear

* Keep only math classes: a course counts as math when its lower-cased
* description contains "math" or "algebra" (strpos() returns 0 when the
* substring is absent).
gen low_course = lower(course_description)
keep if strpos(low_course, "math") > 0 | strpos(low_course, "algebra") > 0

* One row per student who has at least one math class.
keep new_st_id
duplicates drop

* Make 44 weeks for each student: week runs 1..44 within new_st_id.
expand 44
bysort new_st_id: gen week = _n

* Merge the dates for each week (mmerge is a user-written command).
mmerge week using "${rawdatapath}weeks.dta"
drop _merge

* Merge the Khan data for each student/week; Khan rows that do not match a
* roster student/week (_merge == 2) are discarded.
mmerge new_st_id week_date using "${rawdatapath}ka_data_yr1.dta"
drop if _merge == 2

* Keep one row per student/week. Stata's sort orders ties arbitrarily, so
* sorting on new_st_id week alone would keep an unpredictable row whenever a
* student/week has several Khan rows. The extra sort keys in parentheses make
* the kept row reproducible: lowest new_t_email first, then lowest total_minutes.
* NOTE(review): rows that tie on both keys can still be ordered arbitrarily;
* confirm such rows are exact duplicates, or add further keys.
* dup is kept in the saved file (0 = unique student/week, 1 = first of several).
bysort new_st_id week (new_t_email total_minutes): gen dup = cond(_N==1,0,_n)
drop if dup > 1
drop new_t_email teacher_joined_ts student_joined_ts interval_start_date interval_end_date

save "${datapath}ka_minutes_by_week.dta", replace










* YEAR 2
* Build teacher-level averages from the year-2 Khan Academy extract.
use "${rawdatapath}ka_data_yr2.dta", clear

local ka_measures total_minutes learning_minutes math_minutes ///
	math_learning_minutes mlm_exercise mlm_video mlm_article ///
	math_skills_practiced msl_net msl_familiar msl_proficient ///
	msl_master msp_earned

* Every input row contributes 1, so the (sum) below counts the rows behind
* each teacher/week cell.
generate num_students = 1

* Step 1: one row per teacher and week, holding the row count and the means.
collapse (sum) num_students (mean) `ka_measures', by(new_t_email week_date)

sort new_t_email week_date

* Remove weeks outside the analysis window. The td() literals evaluate to the
* integers 22872, 22969, 22997, 23004, 23081 and 23151 respectively.
* NOTE(review): presumably week_date is a Stata daily date and these are the
* weeks before the start of school, school breaks, and the end of the year -- confirm.
drop if week_date < td(15aug2022) ///
	| week_date == td(20nov2022) ///
	| inrange(week_date, td(18dec2022), td(25dec2022)) ///
	| week_date == td(12mar2023) ///
	| week_date >= td(21may2023)

* Step 2: average the remaining weekly rows up to one row per teacher.
collapse (mean) num_students `ka_measures', by(new_t_email)

* Discard teachers whose average weekly row count is below 10.
drop if num_students < 10

* Tag every variable except the teacher key with a _yr2 suffix.
foreach v of varlist num_students `ka_measures' {
	rename `v' `v'_yr2
}

save "${datapath}ka_class_averages_yr2.dta", replace

* Year 1: average the saved teacher-by-week means over weeks, leaving one row
* per teacher (new_t_email).
use "${datapath}ka_class_averages.dta", clear

local ka_measures total_minutes learning_minutes math_minutes ///
	math_learning_minutes mlm_exercise mlm_video mlm_article ///
	math_skills_practiced msl_net msl_familiar msl_proficient ///
	msl_master msp_earned

collapse (mean) `ka_measures', by(new_t_email)
save "${datapath}ka_class_averages_yr1.dta", replace







/*
use "${datapath}students_minutes_full_weeks.dta", clear

gen num_students = 1

collapse (sum) num_students (mean) total_minutes learning_minutes math_minutes math_learning_minutes mlm_exercise mlm_video mlm_article math_skills_practiced msl_net msl_familiar msl_proficient msl_master msp_earned, by(new_t_email new_t_id week_date)

collapse (mean) num_students total_minutes learning_minutes math_minutes math_learning_minutes mlm_exercise mlm_video mlm_article math_skills_practiced msl_net msl_familiar msl_proficient msl_master msp_earned, by(new_t_email new_t_id)

rename * year1_*
rename year1_new_t_email new_t_email
rename year1_new_t_id new_t_id


*Drop if we don't have year 1 estimates for them
merge 1:1 new_t_email using "${datapath}ka_class_averages_yr2.dta"
drop if _merge == 2
drop _merge

*Drop if not treatment in year 1
merge 1:1 new_t_id using "${datapath}teacher_treatment_status.dta"
drop if treatment == 0
drop if _merge != 3
drop _merge


merge 1:1 new_t_email using "${dopath}/robustness/year2_set_up_meeting"
drop if _merge == 2

gen class_minutes00 = year1_total_minutes==0
gen class_minutes0199 = year1_total_minutes>=1
gen class_minutes1099 = year1_total_minutes>=10
gen class_minutes1599 = year1_total_minutes>=15
gen class_minutes2099 = year1_total_minutes>=20
gen class_minutes2599 = year1_total_minutes>=25
gen class_minutes3099 = year1_total_minutes>=30
gen class_minutes3599 = year1_total_minutes>=35
gen class_minutes4099 = year1_total_minutes>=40
gen class_minutes4599 = year1_total_minutes>=45
gen class_minutes5099 = year1_total_minutes>=50
gen class_minutes5599 = year1_total_minutes>=55
gen class_minutes6099 = year1_total_minutes>=60
replace total_minutes = 0 if total_minutes ==.

gen above5 = 0
replace above5 = 1 if total_minutes >= 5

*Drop if they received treatment in year 2
preserve
drop if _merge != 1

foreach v of varlist class_minutes00 class_minutes0199 class_minutes1099 class_minutes1599 class_minutes2099 class_minutes2599 class_minutes3099 class_minutes3599 class_minutes4099 class_minutes4599 class_minutes5099 class_minutes5599 class_minutes6099 {
	sum total_minutes if `v' == 1
	
}


histogram total_minutes if total_minutes<100, start(0) width(10) ylabel(0(10)100) percent name(graph1, replace) title("All Grades") xtitle("Minutes") xscale(range(0 100)) xlabel(0(20)100, nogrid) color(ltblue) ylabel(, nogrid)
histogram total_minutes if total_minutes<100 & teachergrade<=6, start(0) width(10)  ylabel(0(10)100) percent name(graph2, replace) title("Grades 3-6") xtitle("Minutes") xscale(range(0 100)) xlabel(0(20)100, nogrid) color(ltblue) ylabel(, nogrid)
histogram total_minutes if total_minutes<100 & teachergrade>6, start(0) width(10)  ylabel(0(10)100) percent name(graph3, replace) title("Grades 7-8") xtitle("Minutes") xscale(range(0 100)) xlabel(0(20)100, nogrid) color(ltblue) ylabel(, nogrid)

graph combine graph1 graph2 graph3, rows(1) cols(3) xsize(10) title("Average Weekly Practice Time, Year after Treatment") subtitle("Distribution: Class Averages")

graph export "${outputpath}final_figures/minutes_dist_buckets_year2.png", replace

histogram year1_total_minutes if year1_total_minutes<100, start(0) width(10) ylabel(0(10)100) percent name(graph1, replace) title("All Grades") xtitle("Minutes") xscale(range(0 100)) xlabel(0(20)100, nogrid) color(ltblue) ylabel(, nogrid)
histogram year1_total_minutes if year1_total_minutes<100 & teachergrade<=6, start(0) width(10)  ylabel(0(10)100) percent name(graph2, replace) title("Grades 3-6") xtitle("Minutes") xscale(range(0 100)) xlabel(0(20)100, nogrid) color(ltblue) ylabel(, nogrid)
histogram year1_total_minutes if year1_total_minutes<100 & teachergrade>6, start(0) width(10)  ylabel(0(10)100) percent name(graph3, replace) title("Grades 7-8") xtitle("Minutes") xscale(range(0 100)) xlabel(0(20)100, nogrid) color(ltblue) ylabel(, nogrid)

graph combine graph1 graph2 graph3, rows(1) cols(3) xsize(10) title("Average Weekly Practice Time, Year of Treatment") subtitle("Distribution: Class Averages")

graph export "${outputpath}final_figures/minutes_dist_buckets_year1.png", replace




sum total_minutes
restore 

*Keep if they received treatment in year 2

preserve

foreach v of varlist class_minutes00 class_minutes0199 class_minutes1099 class_minutes1599 class_minutes2099 class_minutes2599 class_minutes3099 class_minutes3599 class_minutes4099 class_minutes4599 class_minutes5099 class_minutes5599 class_minutes6099 {
	sum total_minutes if `v' == 1
	
}

restore 


preserve

foreach v of varlist class_minutes00 class_minutes0199 class_minutes1099 class_minutes1599 class_minutes2099 class_minutes2599 class_minutes3099 class_minutes3599 class_minutes4099 class_minutes4599 class_minutes5099 class_minutes5599 class_minutes6099 {
	sum above5 if `v' == 1
	
}
restore



drop _merge
merge 1:1 new_t_id using "${datapath}survey_results.dta"
drop if _merge == 2

gen khan_account = 1
replace khan_account = 0 if ka_use_purpose == "I have never used it"
preserve

foreach v of varlist class_minutes00 class_minutes0199 class_minutes1099 class_minutes1599 class_minutes2099 class_minutes2599 class_minutes3099 class_minutes3599 class_minutes4099 class_minutes4599 class_minutes5099 class_minutes5599 class_minutes6099 {
	sum khan_account if `v' == 1
	
}
restore


